library(phyloseq) # for phyloseq object
library(Biostrings)
library(ggplot2)
library(cowplot)
library(tidyverse)
library("plotly") # plot 3D
library("microbiome") # for centered log-ratio
library("coda") # Aitchison distance
library("coda.base") # Aitchison distance
library("vegan") # NMDS
library(pheatmap) # for heatmap
prune_samples(sample_sums(physeq.agp)>=500, physeq.agp)
phyloseq-class experiment-level object
otu_table() OTU Table: [ 6391 taxa and 1190 samples ]
sample_data() Sample Data: [ 1190 samples by 25 sample variables ]
tax_table() Taxonomy Table: [ 6391 taxa by 7 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 6391 tips and 6389 internal nodes ]
refseq() DNAStringSet: [ 6391 reference sequences ]
Phylogenetic tree was computed with the package phangorn, and the script was run on a cluster. Let’s check we have correctly generated a phylogenetic tree.
# Look at the tree
plot_tree(physeq.agp, color = "host_disease", ladderize="left")
# Plot Phylum
plot_bar(physeq.agp, fill = "Phylum") + facet_wrap("host_disease", scales="free") +
theme(axis.text.x = element_blank())+
labs(x = "Samples", y = "Absolute abundance", title = "AGP dataset")
Sequencing depth characteristics of the Zhuang dataset:
- minimum of 578 total count per sample
- median: 2.328410^{4} total count per sample
- maximum of 4.5417410^{5} total count per sample
Looking at it more closely, there is a high proportion of Proteobacteria compared to other datasets. Considering that fecal samples were shipped from the participants’ home at room temperature, there is evidence that some bacteria grow at room temperature, which would bias the microbial composition (https://journals.asm.org/doi/10.1128/mSystems.00199-16#B5). Authors from the AGP removed a selection of bloom sequences that were shared on their github repository in a FASTA file.
# Sanity check no sample with less than 500 total count
table(sample_sums(physeq.filt)<500) # all FALSE
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH NON-ZERO COMPOSITIONS
physeq.NZcomp <- physeq.filt
otu_table(physeq.NZcomp)[otu_table(physeq.NZcomp) == 0] <- 0.5 # pseudocounts
# Sanity check that 0 values have been replaced
# otu_table(physeq.filt)[1:5,1:5]
# otu_table(physeq.NZcomp)[1:5,1:5]
# transform into compositions
physeq.NZcomp <- transform_sample_counts(physeq.NZcomp, function(x) x / sum(x) )
table(rowSums(otu_table(physeq.NZcomp))) # check if there is any row not summing to 1
# Save object
saveRDS(physeq.NZcomp, file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_NZcomp.rds"))
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH RELATIVE COUNT (BETWEEN 0 AND 1)
physeq.rel <- physeq.filt
physeq.rel <- transform_sample_counts(physeq.rel, function(x) x / sum(x) ) # divide each count by the total number of counts (per sample)
# check the counts are all relative
# otu_table(physeq.filt)[1:5, 1:5]
# otu_table(physeq.rel)[1:5, 1:5]
# sanity check
table(rowSums(otu_table(physeq.rel))) # check if there is any row not summing to 1
# save the physeq.rel object
saveRDS(physeq.rel, file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_relative.rds"))
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH COMMON-SCALE NORMALIZATION
physeq.CSN <- physeq.filt
physeq.CSN <- transform_sample_counts(physeq.CSN, function(x) (x*min(sample_sums(physeq.CSN))) / sum(x) )
# sanity check
table(rowSums(otu_table(physeq.CSN))) # check that all rows are summing to the same total
# save the physeq.CSN object
saveRDS(physeq.CSN, file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_CSN.rds"))
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH CENTERED LOG RATIO COUNT
physeq.clr <- physeq.filt
physeq.clr <- microbiome::transform(physeq.filt, "clr") # the function adds pseudocounts itself
# Compare the otu tables in the original phyloseq object and the new one after CLR transformation
otu_table(physeq.filt)[1:5, 1:5] # should contain absolute counts
otu_table(physeq.clr)[1:5, 1:5] # should all be relative
# save the physeq.rel object
saveRDS(physeq.clr, file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_clr.rds"))
First, let’s look at these four distances of interest.
Now let’s plot!
# Get the distances & the plot data
dist.agp <- getDistances()
plot.df <- plotDistances2D(dist.agp)
# Plot
ggplot(plot.df, aes(Axis.1, Axis.2, color=host_disease))+
geom_point(size=2, alpha=0.5) + scale_color_manual(values = c('blue', 'red'))+
facet_wrap(distance~., scales='free', nrow=1)+
theme_bw()+
theme(strip.text.x = element_text(size=20))+
labs(color="Disease")
# ggsave(file.path(path.plots, "distances4_MDS.jpg"), height = 4, width = 15)
For better visualization, we will also take a glance at reduction to 3D.
Now let’s plot!
plotDistances3D(dist.agp$UniF, "UniFrac")
plotDistances3D(dist.agp$Ait, "Aitchison")
plotDistances3D(dist.agp$Canb, "Canberra")
plotDistances3D(dist.agp$Bray, "Bray-Curtis")
# For heatmaps: have group color
matcol <- data.frame(group = sample_data(physeq.filt)[,"host_disease"])
# Function to get heatmap from the distances computed
plotHeatmaps <- function(dlist, fontsize){
# Initialize variables
i=1
plist <- vector("list", 4)
names(plist) <- names(dlist)
# Loop through distances
for(d in dlist){
plist[[i]] <- pheatmap(as.matrix(d),
clustering_distance_rows = d,
clustering_distance_cols = d,
fontsize = fontsize,
fontsize_col = fontsize-5,
fontsize_row = fontsize-5,
annotation_col = matcol,
annotation_row = matcol,
annotation_colors = list(host_disease = c('Healthy' = 'blue', 'IBS' = 'red')),
cluster_rows = T,
cluster_cols = T,
clustering_method = 'complete', # hc method
main = names(dlist)[i]) # have name of distance as title
i <- i+1
}
return(plist)
}
# Get the heatmaps
heatmp.agp <- plotHeatmaps(dlist = dist.agp, fontsize = 8)